{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5b1d5fb5-89a4-4812-8eba-b7748ba4d347",
   "metadata": {},
   "source": [
    "# Lesson 2: Loading and preparing data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1549545a-43bd-461b-897f-fdf623bf7e4a",
   "metadata": {},
   "source": [
    "# Loading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b666824e-a940-484f-b3f0-6b7682ef4677",
   "metadata": {
    "height": 30
   },
   "outputs": [],
   "source": [
    "import \"dotenv/config\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bee5413-fcde-4dd9-b5fb-89023203cfb2",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "import { GithubRepoLoader } from \"langchain/document_loaders/web/github\";\n",
    "// Peer dependency, used to support .gitignore syntax\n",
    "import ignore from \"ignore\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3e86ff7-6368-4442-8ee6-0a1a3c1b35e0",
   "metadata": {
    "height": 97
   },
   "outputs": [],
   "source": [
    "// Build a loader for the LangChain.js GitHub repository.\n",
    "// Will not include anything under \"ignorePaths\"\n",
    "// (the `ignore` peer dependency imported earlier supplies the .gitignore-style matching).\n",
    "// NOTE(review): `recursive: false` presumably limits loading to the repository's\n",
    "// top-level entries - confirm against the GithubRepoLoader documentation.\n",
    "const loader = new GithubRepoLoader(\n",
    "  \"https://github.com/langchain-ai/langchainjs\",\n",
    "  { recursive: false, ignorePaths: [\"*.md\", \"yarn.lock\"] }\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "634988ed-0a57-49bc-9261-dc19fe66529b",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "// Run the loader and keep the result in `docs`.\n",
    "const docs = await loader.load();\n",
    "\n",
    "// Log only the first three entries rather than the whole result.\n",
    "console.log(docs.slice(0, 3));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b997a07f-bfb0-407b-8959-fc0c467bd30f",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "// Peer dependency\n",
    "import * as parse from \"pdf-parse\";\n",
    "import { PDFLoader } from \"langchain/document_loaders/fs/pdf\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cd470c94-5bec-4efe-8395-416b0a408f8b",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "// Create a PDFLoader for the lecture PDF stored under ./data.\n",
    "// NOTE(review): this reuses the name `loader` from the GitHub example above, so any cell\n",
    "// that calls `loader.load()` after this one has run will presumably use the PDF loader.\n",
    "const loader = new PDFLoader(\"./data/MachineLearning-Lecture01.pdf\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4476713f-5285-4d2f-9126-d30ee9825090",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "// Load the PDF; the result is reused by the splitting section later in the notebook.\n",
    "const rawCS229Docs = await loader.load();\n",
    "\n",
    "// Log only the first five entries.\n",
    "console.log(rawCS229Docs.slice(0, 5));"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca1a583f-c40c-43fa-a554-d0dbecfd543a",
   "metadata": {},
   "source": [
    "# Splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2581cfc-aec6-4600-860c-1e100d42a382",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "import { RecursiveCharacterTextSplitter } from \"langchain/text_splitter\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10a566f9-4c8e-4199-a394-f52baaceac4d",
   "metadata": {
    "height": 97
   },
   "outputs": [],
   "source": [
    "// Recursive splitter created through the `fromLanguage` factory with the \"js\" preset.\n",
    "// chunkSize 32 / chunkOverlap 0 are the baseline settings; later cells vary the splitter\n",
    "// type and these values on the same input.\n",
    "// NOTE(review): sizes are presumably counted in characters - confirm against the splitter docs.\n",
    "const splitter = RecursiveCharacterTextSplitter.fromLanguage(\"js\", {\n",
    "  chunkSize: 32,\n",
    "  chunkOverlap: 0,\n",
    "});"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6da99c5b-08d1-4f6c-a4dd-aed004b6e023",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "// Small JavaScript sample shared as input by the splitter comparisons in the following cells.\n",
    "const code = `function helloWorld() {\n",
    "console.log(\"Hello, World!\");\n",
    "}\n",
    "// Call the function\n",
    "helloWorld();`;\n",
    "\n",
    "// Last expression of the cell: the awaited result of splitting the sample with the current `splitter`.\n",
    "await splitter.splitText(code);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9699b4e1-a46c-44e2-8f13-2f6ff62f49e9",
   "metadata": {
    "height": 182
   },
   "outputs": [],
   "source": [
    "import { CharacterTextSplitter } from \"langchain/text_splitter\";\n",
    "\n",
    "// Same sample and same chunkSize/chunkOverlap as the JavaScript-preset splitter above,\n",
    "// but using CharacterTextSplitter with a single space as its separator.\n",
    "// NOTE(review): redeclares `splitter`; cells run after this one see this instance\n",
    "// until `splitter` is declared again.\n",
    "const splitter = new CharacterTextSplitter({\n",
    "  chunkSize: 32,\n",
    "  chunkOverlap: 0,\n",
    "  separator: \" \"\n",
    "});\n",
    "\n",
    "// Last expression of the cell: the awaited split of the shared sample.\n",
    "await splitter.splitText(code);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8219b6ca-bf2d-4471-91f8-b3e6870ce07e",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "// Back to the \"js\" preset of the recursive splitter, now with chunkSize 64 and\n",
    "// chunkOverlap 32 (the earlier \"js\" example used 32 and 0), applied to the same sample.\n",
    "const splitter = RecursiveCharacterTextSplitter.fromLanguage(\"js\", {\n",
    "  chunkSize: 64,\n",
    "  chunkOverlap: 32,\n",
    "});\n",
    "\n",
    "// Last expression of the cell: the awaited split of the shared sample.\n",
    "await splitter.splitText(code);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66d091de-c9a8-429f-8bda-d43b489cdace",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "// Recursive splitter built with the plain constructor (no language preset),\n",
    "// chunkSize 512 and chunkOverlap 64; the next cell applies it to the loaded lecture PDF.\n",
    "const splitter = new RecursiveCharacterTextSplitter({\n",
    "  chunkSize: 512,\n",
    "  chunkOverlap: 64,\n",
    "});"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0ce99cb-ab2f-4443-93c8-a5f7c979de71",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "// Split the loaded lecture PDF documents with the current `splitter`.\n",
    "const splitDocs = await splitter.splitDocuments(rawCS229Docs);\n",
    "\n",
    "// Log only the first five entries of the split result.\n",
    "console.log(splitDocs.slice(0, 5));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7be9d41f-64c2-4839-a8fd-c2f32a33c5dc",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5e32044-6e84-470a-91a7-262918d5a22b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d678f1d-c37c-462f-96ba-d1c84ab35482",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18f6254b-5eaf-4867-999b-97d08f96cee9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c326a7cc-d9be-432c-be10-7846d2b8e5e9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b25e21c-28d0-4329-852e-b96a14c38529",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "520b4a9b-89de-4652-b238-b7dddbd5a2c2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c325c8b5-9292-41f2-ad10-3ad0ad3df182",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5fe1d4c-9075-47d4-8451-b9bbc1115668",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cb3853d-826c-4e1f-a413-9f9f0faf62fe",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3dd60bd2-fcc1-4f96-8763-969e30d52c13",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8f81c9f-88d7-470a-b597-c4137c9d35a4",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8e56d1e-17e2-4c11-984a-da63e0d13e55",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Deno",
   "language": "typescript",
   "name": "deno"
  },
  "language_info": {
   "file_extension": ".ts",
   "mimetype": "text/x.typescript",
   "name": "typescript",
   "nb_converter": "script",
   "pygments_lexer": "typescript",
   "version": "5.3.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
